#coding=UTF-8
import jieba,requests,re,xlsxwriter,os
from bs4 import BeautifulSoup
from collections import Counter

HEADERS = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
          ,'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' # content types the client can accept
          ,'Accept-Language': 'en-US,en;q=0.5' # languages the browser accepts
          ,'Connection': 'keep-alive' # request a persistent connection
          }
ARTICLE_URLS = []

def getArticleUrlList():
    """Load article URLs (one per line) into the module-level ARTICLE_URLS list.

    Reads ../get_pic_from_weixin/articleUrlList.txt and appends each line,
    stripped of newline characters, to ARTICLE_URLS.
    """
    # Context manager guarantees the file handle is closed even if reading raises
    # (the original closed it manually and leaked the handle on error).
    with open('../get_pic_from_weixin/articleUrlList.txt', 'r', encoding='utf-8') as a_f:
        for l in a_f:
            ARTICLE_URLS.append(l.replace('\n', ''))

# Fetch an article page by URL
def requestHtml(_url):
    """Download the page at _url using the module HEADERS and return its body text."""
    return requests.get(_url, headers=HEADERS).text


def getRePattern(_express):
    """Compile the regex string _express and return the pattern object."""
    compiled = re.compile(_express)
    return compiled

def getTargetList(_express,_str):
    """Return all non-overlapping matches of the regex _express found in _str."""
    # Compile-then-findall, exactly what routing through getRePattern produced.
    return re.compile(_express).findall(_str)

'''<meta property="og:title" content="请留意身边这样的......">'''
weixin_re = '<meta property="og:title" content="(.+?)" />'
# Extract the article title from the HTML
def getTheTitleOfArticleFromHtml(_html):
    """Return the article title from the og:title meta tag, or 'no title' if absent."""
    # Same pattern as the module-level weixin_re, applied inline.
    matches = re.findall('<meta property="og:title" content="(.+?)" />', _html)
    return matches[0] if matches else 'no title'
# Extract the text content (Chinese text) from the HTML
def getContentFromHtml(_html):
    """Strip all markup from _html and return only its text content."""
    soup = BeautifulSoup(_html, 'html.parser')
    return soup.text


# Count word frequencies in the text
def analyzeArticle(_content):
    """Segment _content with jieba and count word frequencies.

    Tokens of a single character and literal '\\r\\n' tokens are skipped.
    Returns a collections.Counter mapping token -> occurrence count.
    """
    tokens = jieba.cut(_content)
    return Counter(tok for tok in tokens if len(tok) > 1 and tok != '\r\n')

statistic_dict = {}
def packDictTitleWithConstant(_html):
    """Analyze one article page and record its word counts in statistic_dict.

    Uses the page title as the key and the article's word-frequency Counter
    as the value; mutates the module-level statistic_dict in place.
    """
    key = getTheTitleOfArticleFromHtml(_html)
    statistic_dict[key] = analyzeArticle(getContentFromHtml(_html))
    #return stastic_dict

# Create the Excel workbook
def generateExcel(_statistics_dict):
    """Write per-article word-frequency stats to statistics.xlsx and open it.

    One column per article: row 0 holds the title, rows 1..30 hold the
    top-30 words formatted as '<pad><word> <stars> <count>'.
    Opens the file afterwards via the Windows 'start' shell command.
    """
    f = 'statistics.xlsx'
    workbook = xlsxwriter.Workbook(f)
    worksheet = workbook.add_worksheet('statistics')
    for col, (title, counter) in enumerate(_statistics_dict.items()):
        worksheet.write_string(0, col, title)
        for row, (word, freq) in enumerate(counter.most_common(30), start=1):
            worksheet.write_string(row, col, '%s%s %s %d' % (' '*(5-len(word)), word, '*'*int(freq/3), freq))
    # Original widened columns 0..len(dict) inclusive (one past the last used column).
    worksheet.set_column(0, len(_statistics_dict), 65)
    workbook.close()
    os.system('start '+f)


# Iterate over the article URLs, fetch each article's text content, and analyze it
if __name__ == '__main__':
    # Load the URL list, fetch and analyze every article, then export the
    # accumulated statistics to Excel. The dead `if False:` debug-print
    # branch and commented-out local-file testing code were removed.
    getArticleUrlList()
    for _u in ARTICLE_URLS:
        html = requestHtml(_u)
        packDictTitleWithConstant(html)
    generateExcel(statistic_dict)